{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Métricas de Avaliação"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### - Para problemas de Classificação"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary Stats\n",
      "Precision = 0.862068965517\n",
      "Recall = 0.862068965517\n",
      "F1 Score = 0.862068965517\n",
      "Class 0.0 precision = 0.809523809524\n",
      "Class 0.0 recall = 0.809523809524\n",
      "Class 0.0 F1 Measure = 0.809523809524\n",
      "Class 1.0 precision = 1.0\n",
      "Class 1.0 recall = 1.0\n",
      "Class 1.0 F1 Measure = 1.0\n",
      "Class 2.0 precision = 0.789473684211\n",
      "Class 2.0 recall = 0.789473684211\n",
      "Class 2.0 F1 Measure = 0.789473684211\n"
     ]
    }
   ],
   "source": [
    "from pyspark.mllib.classification import LogisticRegressionWithLBFGS\n",
    "from pyspark.mllib.util import MLUtils\n",
    "from pyspark.mllib.evaluation import MulticlassMetrics\n",
    "\n",
    "# Carrega os dados\n",
    "data = MLUtils.loadLibSVMFile(sc, \"data/sample_multiclass_classification_data.txt\")\n",
    "\n",
    "# Divide o conjunto de dados em treino (60%) e teste (40%)\n",
    "training, test = data.randomSplit([0.6, 0.4], seed=11L)\n",
    "training.cache()\n",
    "\n",
    "# Executa o processo de treino para construção do modelo\n",
    "model = LogisticRegressionWithLBFGS.train(training, numClasses=3)\n",
    "\n",
    "# Faz as predições\n",
    "predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))\n",
    "\n",
    "# Instancia objeto que possui as metricas necessárias\n",
    "metrics = MulticlassMetrics(predictionAndLabels)\n",
    "\n",
    "# Overall statistics\n",
    "precision = metrics.precision()\n",
    "recall = metrics.recall()\n",
    "f1Score = metrics.fMeasure()\n",
    "print(\"Summary Stats\")\n",
    "print(\"Precision = %s\" % precision)\n",
    "print(\"Recall = %s\" % recall)\n",
    "print(\"F1 Score = %s\" % f1Score)\n",
    "\n",
    "# Resultados por classe\n",
    "labels = data.map(lambda lp: lp.label).distinct().collect()\n",
    "for label in sorted(labels):\n",
    "    print(\"Class %s precision = %s\" % (label, metrics.precision(label)))\n",
    "    print(\"Class %s recall = %s\" % (label, metrics.recall(label)))\n",
    "    print(\"Class %s F1 Measure = %s\" % (label, metrics.fMeasure(label, beta=1.0)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### - Problemas de Regressão"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MSE = 103.309686818\n",
      "RMSE = 10.1641372884\n",
      "R-squared = 0.0276391109678\n",
      "MAE = 8.14869190795\n",
      "Explained variance = 2.88839520172\n"
     ]
    }
   ],
   "source": [
    "from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD\n",
    "from pyspark.mllib.evaluation import RegressionMetrics\n",
    "from pyspark.mllib.linalg import DenseVector\n",
    "\n",
    "# Cria estrutura do tipo LabeledPoint\n",
    "def parsePoint(line):\n",
    "    values = line.split()\n",
    "    return LabeledPoint(float(values[0]),\n",
    "                        DenseVector([float(x.split(':')[1]) for x in values[1:]]))\n",
    "\n",
    "data = sc.textFile(\"data/sample_linear_regression_data.txt\")\n",
    "parsedData = data.map(parsePoint)\n",
    "\n",
    "# Constroi o modelo\n",
    "model = LinearRegressionWithSGD.train(parsedData)\n",
    "\n",
    "# Faz as predições\n",
    "valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label))\n",
    "\n",
    "# Instantiate metrics object\n",
    "metrics = RegressionMetrics(valuesAndPreds)\n",
    "\n",
    "# Squared Error\n",
    "print(\"MSE = %s\" % metrics.meanSquaredError)\n",
    "print(\"RMSE = %s\" % metrics.rootMeanSquaredError)\n",
    "\n",
    "# R-squared\n",
    "print(\"R-squared = %s\" % metrics.r2)\n",
    "\n",
    "# Mean absolute error\n",
    "print(\"MAE = %s\" % metrics.meanAbsoluteError)\n",
    "\n",
    "# Explained variance\n",
    "print(\"Explained variance = %s\" % metrics.explainedVariance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Pyspark (Py 2)",
   "language": "",
   "name": "pyspark"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
